********************************************************************************

* Filename: Table_1_Attrition.do

********************************************************************************
*  This do-file is part of the collection of replication files for Schaffner, Glewwe, and Sharma, 
*  "Why Programs Fail:  Lessons for Improving Public Service Quality from a Mixed Methods Evaluation
*   of an Unsuccessful Teacher Training Program in Nepal" World Bank Economic Review
*
*This do-file calculates:
*      - all student attrition statistics shown in Table 1    
*
* Software version used: STATA/SE 18.0
********************************************************************************
*GLOBAL FILE PATH DEFINITIONS
	* Set PBRfolder to the root folder of the replication package.  The Datasets
	* and Logs sub-folders are located relative to it.
	* Forward slashes are used as directory separators because Stata accepts them
	* on Windows, macOS, and Unix, whereas backslashes only work on Windows.
	global PBRfolder "ADD FILE REFERENCE TO MAIN FOLDER HERE" 
	global datasets "$PBRfolder/Datasets"
	global logs "$PBRfolder/Logs"

********************************************************************************
* SET-UP
	set more off          // do not pause output at --more-- prompts
	clear all             // start from an empty session (global macros defined above are kept)
	set varabbrev on      // allow abbreviated variable names
	capture log close     // close a log left open by an earlier run; no error if none is open
	log using "$logs/Table_1_calcs", replace
	cd "$datasets"        // datasets below are referenced by bare file name
********************************************************************************

**  Calculation of Table 1 student attrition statistics for students in grade 8 at baseline

********************************************************************************

	* Load the endline tracking sheet for grade 9 (students who were in grade 8 at baseline)
		use stu_tracking09_endline, clear

	* Attach study arm and sample design variables from the school-level file
		sort schoolid
		merge m:1 schoolid using basicdata  // authors' recorded result: 0 unmatched
		generate treat = inlist(studyarm, 1, 2)          // arms 1 and 2 are coded as treatment
		generate dist_stratum = 10*district + stratum    // one code per district-by-stratum cell

	* Tidy the raw tracking variables
		rename C8_T_AGE age
		replace age = . if age == 99                     // 99 is recoded to missing
		rename IS_THIS_STUDENT_PRESENT_IN_GRADE qpresence
		* present at endline when qpresence is 1-4; left missing when qpresence is missing
		generate present_endline = inrange(qpresence, 1, 4) if qpresence != .
		rename WHAT_IS_THE_STUDENT_S_STATUS_IN notpresent_status

	* Restrict to baseline students (rows with a presence record)
		keep if present_endline != .  // authors' recorded count of rows removed: 1793

	* Attrition indicators, overall and by reason
		generate attrit_all = 1 - present_endline if present_endline != .
		replace notpresent_status = . if notpresent_status == 6

		* Reason codes in notpresent_status:
		*   1 = enrolled in grade but absent        -> absent
		*   2 = in one grade lower in the school    -> repeated
		*   3 = moved to another government school  -> moved
		*   4 = moved to a private school           -> private
		*   5 = not enrolled in school              -> dropout
		local code = 0
		foreach reason in absent repeated moved private dropout {
			local ++code
			generate `reason' = notpresent_status == `code'
		}

		* not present and no reason recorded: don't know what became of them
		generate unknown = (present_endline == 0) & (notpresent_status == .)

		* "classes not in session" only arises for 10th graders, so it is zero throughout here
		generate no_classes = 0

		* residual category: repeating previous grade, moved to another school, or unknown
		generate attrit_other = inlist(1, repeated, moved, private, unknown)

		* Table 1 uses attrit_all, absent, dropout, no_classes, and attrit_other
		
	* Student counts by arm for Table 1, grade 8
		tabulate treat  // authors' recorded counts: 3906 control, 3743 treatment

	* Table 1, grade 8 baseline students: unweighted share of baseline students
	* lost to follow-up, overall and by reason, separately for control and treatment
		foreach outcome in attrit_all absent dropout no_classes attrit_other {
			tabulate treat, summarize(`outcome')
		}

	* Test whether the overall attrition rate differs between treatment and control
	* (schools as sampling units, school weights, district-by-stratum strata and cell effects)
		svyset schoolid [pweight=sch_wght], strata(dist_stratum)
		svy: regress attrit_all treat district#stratum
		test treat

	* Keep the attrition data in a temporary file for the score merges that follow
		tempfile attrition
		save "`attrition'", replace

	* Get and process math score data
		
		* keep only the grade 8 (baseline) math records from the panel IRT file
		use Math8and9_panel_IRT , clear
		keep stu_serial schoolid theta_panel_g8_9 test
		keep if test=="Math08" //6801 deleted
		sort stu_serial
		unique stu_serial   // user-written command; reports the number of distinct student ids
		tempfile scores
		save "`scores'", replace  
		
		* read in student attrition data, merge in the scores
		* (the svyset declaration made before the attrition data were saved is restored with the data)
			use "`attrition'", clear
			sort stu_serial
			drop _merge         // left over from the earlier merge with basicdata
			merge 1:1 stu_serial using "`scores'"  //2 unmatched (2 using)
			keep if _merge==3   // students found in both the tracking data and the score data
			drop _merge
		
		* rename math test score 
			rename theta_panel_g8_9 score
			count if missing(score) // 0
			summ score, detail
			
		
		* standardize scores (subtracting off control sample mean and dividing by control sample standard deviation)
		* The first column of e(b) and of r(sd) is the treat==0 (control) group.
		* Scalars and matrices get distinct names, and the scalars are referenced through
		* scalar(): in an expression a bare name resolves to a variable (or, with
		* varabbrev on, a variable abbreviation) before it resolves to a scalar, so a
		* variable called "mean..." or "sd..." would otherwise be used silently.
			svy, over(treat): mean score
			matrix score_means = e(b)
			scalar ctrl_mean = score_means[1,1]
			di scalar(ctrl_mean)
			estat sd
			matrix score_sds = r(sd)
			scalar ctrl_sd = score_sds[1,1]
			di scalar(ctrl_sd)
			gen score_std = (score - scalar(ctrl_mean))/scalar(ctrl_sd)
					
		* average standardized math test scores among attritters by treatment/control 
			bys treat attrit_all: summ(score_std)  // -0.16  -0.29
	
		* Test of equality of baseline math test score average among attritters
			svy: reg score_std treat district#stratum if attrit_all==1  
			test treat   // p-val 0.0628
			
					
	* Get and process science score data 
		
		* keep only the grade 8 (baseline) science records from the panel IRT file
		use Sci8and9_panel_IRT , clear
		keep stu_serial schoolid theta_panel_g8_9 test
		rename theta_panel_g8_9 theta_sci
		keep if test=="Sci08" //6801 deleted
		sort stu_serial     // sorted on the merge key, as in the math block
		unique stu_serial  //7651 unique
		tempfile scores
		save "`scores'", replace 
		
		* read in student attrition data, merge in the scores
		* (the svyset declaration made before the attrition data were saved is restored with the data)
			use "`attrition'", clear
			drop _merge         // left over from the earlier merge with basicdata
			sort stu_serial
			merge 1:1 stu_serial using "`scores'" //2 unmatched (2 using)
			keep if _merge==3 //2
			drop _merge
				
		* rename science test score
			rename theta_sci score_s
			count if missing(score_s) // 0
			summ score_s, detail
					
		* standardize scores (subtracting off control sample mean and dividing by control sample standard deviation)
		* The first column of e(b) and of r(sd) is the treat==0 (control) group.
		* Scalars and matrices get distinct names, and the scalars are referenced through
		* scalar(): in an expression a bare name resolves to a variable (or, with
		* varabbrev on, a variable abbreviation) before it resolves to a scalar, so a
		* variable called "mean..." or "sd..." would otherwise be used silently.
			svy, over(treat): mean score_s
			matrix score_s_means = e(b)
			scalar ctrl_mean_s = score_s_means[1,1]
			di scalar(ctrl_mean_s)
			estat sd
			matrix score_s_sds = r(sd)
			scalar ctrl_sd_s = score_s_sds[1,1]
			di scalar(ctrl_sd_s)
			gen score_s_std = (score_s - scalar(ctrl_mean_s))/scalar(ctrl_sd_s)
					
		* average standardized science test scores among attritters by treatment/control
			bys treat attrit_all: summ(score_s_std)  // -.257  -.391

		* Test of equality of baseline science test score average among attritters
			svy: reg score_s_std treat district#stratum if attrit_all==1  
			test treat   // p-val 0.141
			
********************************************************************************

**  Calculation of Table 1 student attrition statistics for students in grade 9 at baseline

********************************************************************************
	* Load the endline tracking sheet for grade 10 (students who were in grade 9 at baseline)
		use stu_tracking10_endline, clear

	* Attach study arm and sample design variables from the school-level file
		sort schoolid
		merge m:1 schoolid using basicdata  // authors' recorded result: 13 unmatched
		drop if _merge != 3                              // retain matched rows only
		generate treat = inlist(studyarm, 1, 2)          // arms 1 and 2 are coded as treatment
		generate dist_stratum = 10*district + stratum    // one code per district-by-stratum cell

	* Tidy the raw tracking variables
		rename C9_T_AGE age
		replace age = . if age == 99                     // 99 is recoded to missing
		rename IS_THIS_STUDENT_PRESENT_IN_GRADE qpresence
		* present at endline when qpresence is 1-4; left missing when qpresence is missing
		generate present_endline = inrange(qpresence, 1, 4) if qpresence != .
		rename WHAT_IS_THE_STUDENT_S_STATUS_IN notpresent_status

	* Restrict to baseline students (rows with a presence record)
		keep if present_endline != .  // authors' recorded count of rows removed: 830

	* Attrition indicators, overall and by reason
		generate attrit_all = 1 - present_endline if present_endline != .
		replace notpresent_status = . if notpresent_status == 6

		* Reason codes in notpresent_status:
		*   1 = enrolled in grade but absent        -> absent
		*   2 = in one grade lower in the school    -> repeated
		*   3 = moved to another government school  -> moved
		*   4 = moved to a private school           -> private
		*   5 = not enrolled in school              -> dropout
		local code = 0
		foreach reason in absent repeated moved private dropout {
			local ++code
			generate `reason' = notpresent_status == `code'
		}

		* not present and no reason recorded: don't know what became of them
		generate unknown = (present_endline == 0) & (notpresent_status == .)

		* every student in this tracking file gets no_classes = 0
		generate no_classes = 0

		* residual category: repeating previous grade, moved to another school, or unknown
		generate attrit_other = inlist(1, repeated, moved, private, unknown)

		* Hold the tracked students in a temporary file
		tempfile partial
		save "`partial'", replace

		* Table 1 uses attrit_all, absent, dropout, no_classes, and attrit_other
		
	* Create observations for students in the 13 schools for which there were no tracking sheets at endline 
	* (because the school was not in session for 10th graders).  Set their attrit_all to 1 and their
	* "not in session" status (no_classes) to 1.
	* The baseline questionnaire data are used to list the students in these schools.
		
		* create dataset of students from baseline in 13 schools for which no tracking sheets were collected
			use Grade09baseline, clear
			rename serial stu_serial
			sort schoolid
			merge m:1 schoolid using basicdata //0 unmatched
			keep if distname=="Jumla" | schoolid==1411217 //8249 deleted
			gen treat = studyarm==1 | studyarm==2
			gen dist_stratum=district*10+stratum
		
			gen no_classes=1
			gen attrit_all=1
			gen absent=0
			gen dropout=0
			gen attrit_other=0
			* district and stratum are retained because the regressions below use
			* district#stratum as regressors.  Without them the appended students have
			* missing regressors and are dropped from every svy: reg, even though they
			* are counted in the tabulations.
			* NOTE(review): p-values recorded in comments further down this file were
			* produced before district and stratum were retained here; re-run and
			* confirm them against the published table.
			keep schoolid stu_serial distname district stratum attrit_all no_classes absent dropout attrit_other studyarm sch_wght treat dist_stratum
			tempfile no_classes
			save "`no_classes'", replace
			
		* append the students in schools that were not in session to the main dataset for baseline grade 9 students
		
			use "`partial'", clear
			unique schoolid  //190
			append using "`no_classes'"
			unique schoolid  // 203
			
	* Numbers of students for Table 1, grade 9
		tab treat  // 4614 control, 4167 treatment
		
	*  Attritters as unweighted percent of baseline students, by reasons for attrition, for grade 9 baseline students, Table 1
		tab treat, summ(attrit_all)
		tab treat, summ(absent)
		tab treat, summ(dropout)
		tab treat, summ(no_classes)
		tab treat, summ(attrit_other)
		
	* Test of equality of overall attrition rate 
		svyset schoolid [pweight=sch_wght], strata(dist_stratum)
		svy: reg attrit_all treat district#stratum 
		test treat  
		
	* Save attrition data
		tempfile attrition
		save "`attrition'", replace

	* Get and process math score data
		
		* keep only the grade 9 (baseline) math records from the panel IRT file
		use Math9and10_panel_IRT , clear
		keep stu_serial schoolid theta_panel_g9_10 test
		keep if test=="Math09" //5833 deleted
		sort stu_serial
		unique stu_serial //8784
		tempfile scores
		save "`scores'", replace  
		
		* read in student attrition data, merge in the scores
		* (the svyset declaration made before the attrition data were saved is restored with the data)
			use "`attrition'", clear
			sort stu_serial
			drop _merge         // left over from the earlier merge with basicdata
			merge 1:1 stu_serial using "`scores'" //9 unmatched (3 master, 6 using)
			keep if _merge==3   // students found in both the attrition data and the score data
			drop _merge
		
		* rename math test score 
			rename theta_panel_g9_10 score
			count if missing(score) // 0
			summ score, detail
						
		* standardize scores (subtracting off control sample mean and dividing by control sample standard deviation)
		* The first column of e(b) and of r(sd) is the treat==0 (control) group.
		* Scalars and matrices get distinct names, and the scalars are referenced through
		* scalar(): in an expression a bare name resolves to a variable (or, with
		* varabbrev on, a variable abbreviation) before it resolves to a scalar, so a
		* variable called "mean..." or "sd..." would otherwise be used silently.
			svy, over(treat): mean score
			matrix score_means = e(b)
			scalar ctrl_mean = score_means[1,1]
			di scalar(ctrl_mean)
			estat sd
			matrix score_sds = r(sd)
			scalar ctrl_sd = score_sds[1,1]
			di scalar(ctrl_sd)
			gen score_std = (score - scalar(ctrl_mean))/scalar(ctrl_sd)
					
		* average standardized math test scores among attritters by treatment/control 
			bys treat attrit_all: summ(score_std)  // -0.25  -0.26
	
		* Test of equality of baseline math test score average among attritters
		* NOTE(review): observations with missing district or stratum are excluded from this
		* regression; confirm that every attritter summarized above carries both variables.
			svy: reg score_std treat district#stratum if attrit_all==1 
			test treat   // p-val 0.234
								
	* Get and process science score data 
		
		* keep only the grade 9 (baseline) science records from the panel IRT file
		use Sci9and10_panel_IRT , clear
		keep stu_serial schoolid theta_panel_g9_10 test
		rename theta_panel_g9_10 theta_sci
		keep if test=="Sci09" //5833 deleted
		sort stu_serial     // sorted on the merge key, as in the math block
		unique stu_serial  //8784 unique
		tempfile scores
		save "`scores'", replace  
		
		* read in student attrition data, merge in the scores
		* (the svyset declaration made before the attrition data were saved is restored with the data)
			use "`attrition'", clear
			drop _merge         // left over from the earlier merge with basicdata
			sort stu_serial
			merge 1:1 stu_serial using "`scores'"  // 9 unmatched (3 master 6 using)
			keep if _merge==3 //9
			drop _merge
				
		* rename science test score
			rename theta_sci score_s
			count if missing(score_s) // 0
			summ score_s, detail
				
		* standardize scores (subtracting off control sample mean and dividing by control sample standard deviation)
		* The first column of e(b) and of r(sd) is the treat==0 (control) group.
		* Scalars and matrices get distinct names, and the scalars are referenced through
		* scalar(): in an expression a bare name resolves to a variable (or, with
		* varabbrev on, a variable abbreviation) before it resolves to a scalar, so a
		* variable called "mean..." or "sd..." would otherwise be used silently.
			svy, over(treat): mean score_s
			matrix score_s_means = e(b)
			scalar ctrl_mean_s = score_s_means[1,1]
			di scalar(ctrl_mean_s)
			estat sd
			matrix score_s_sds = r(sd)
			scalar ctrl_sd_s = score_s_sds[1,1]
			di scalar(ctrl_sd_s)
			gen score_s_std = (score_s - scalar(ctrl_mean_s))/scalar(ctrl_sd_s)
								
		* average standardized science test scores among attritters by treatment/control
			bys treat attrit_all: summ(score_s_std)  // -.25  -.31

		* Test of equality of baseline science test score average among attritters
		* NOTE(review): observations with missing district or stratum are excluded from this
		* regression; confirm that every attritter summarized above carries both variables.
			
			svy: reg score_s_std treat district#stratum if attrit_all==1 
			test treat   // p-val 0.160

log close
	
			
